Use biomod2 to run single models, ensemble test and
final models.
# Load necessary libraries
#if statement to automatically install libraries if absent in r library
#tidyverse - mainly for data wrangling & plotting/mapping
if (!requireNamespace("tidyverse", quietly = TRUE)) {
install.packages("tidyverse")
}
library(tidyverse)
#terra - spatial package
if (!requireNamespace("terra", quietly = TRUE)) {
install.packages("terra")
}
require(terra)
#tidyterra - spatial package for mapping
if (!requireNamespace("tidyterra", quietly = TRUE)) {
install.packages("tidyterra")
}
require(tidyterra)
#biomod2 - ensemble modeling
if (!requireNamespace("biomod2", quietly = TRUE)) {
install.packages("biomod2")
}
require(biomod2)
#earth - ensemble modeling - For MARS model
if (!requireNamespace("earth", quietly = TRUE)) {
install.packages("earth")
}
require(earth)
#randomForest - ensemble modeling - For RF model
if (!requireNamespace("randomForest", quietly = TRUE)) {
install.packages("randomForest")
}
require(randomForest)
#ggtext - for biomod2 plotting
if (!requireNamespace("ggtext", quietly = TRUE)) {
install.packages("ggtext")
}
require(ggtext)
Directories
#ESIIL Macrophenology group - for saving
main.dir = "G:/Shared drives/ESIIL_Macrophenology/Across_sp_SDM"
L2 = file.path(main.dir, "Data/L2")
#ensemble folder for phenology
L2.em = file.path(L2, "ensemble/Phenology")
Data
#load biomod objects
load(file.path(L2, "phenology_timeperiods_cleaned.RData"), verbose = TRUE)
## Loading objects:
## phe.hw
## phe.cw
#read climate rasters
clim.h = rast(file.path(L2, #object saving directory
"ClimRastS_H_cropped.tif"))
clim.c = rast(file.path(L2, #object saving directory
"ClimRastS_C_cropped.tif"))
biomod2 formattingWe will be modeling the phenology for each species individually. We
will filter for each species and then format the data for
biomod2.
Note (03/21/2024): Focusing on Acer rubrum to start.
#Historical
#Select species name
myRespName.h = 'Acer rubrum'
# Get corresponding P/A data
myResp.h = phe.hw[, myRespName.h]
#Get corresponding coordinates
myRespXY.h = as.data.frame(phe.hw[, c('longitude', 'latitude')]) #Make sure long goes first!
Format data with true absences in biomod2
#Historical
bmdat.h = BIOMOD_FormatingData(
resp.name = myRespName.h, #species name
resp.var = myResp.h, #presences-absences
resp.xy = myRespXY.h, #lat/lon
expl.var = clim.h #raster stack
)
##
## -=-=-=-=-=-=-=-=-=-=-=-=-= Acer rubrum Data Formating -=-=-=-=-=-=-=-=-=-=-=-=-=
##
## ! Response variable name was converted into Acer.rubrum
## !!! Some data are located in the same raster cell.
## Please set `filter.raster = TRUE` if you want an automatic filtering.
## ! No data has been set aside for modeling evaluation
## -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= Done -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#save object
# save(bmdat.h, file = file.path(L2, "bmdat_h.RData"))
#biomod data summary
print(bmdat.h)
##
## -=-=-=-=-=-=-=-=-=-=-=-=-=-= BIOMOD.formated.data -=-=-=-=-=-=-=-=-=-=-=-=-=-=
##
## dir.name = .
##
## sp.name = Acer.rubrum
##
## 76 presences, 382 true absences and 322 undefined points in dataset
##
##
## 4 explanatory variables
##
## ppt.avg ppt.anom temp.avg temp.anom
## Min. : 49.11 Min. :-11.044 Min. : 2.590 Min. :-0.62583
## 1st Qu.: 88.12 1st Qu.: -4.871 1st Qu.: 8.526 1st Qu.:-0.24629
## Median : 92.31 Median : -3.626 Median :10.132 Median :-0.09984
## Mean : 94.10 Mean : -3.506 Mean :10.627 Mean :-0.11226
## 3rd Qu.: 98.00 3rd Qu.: -2.273 3rd Qu.:11.900 3rd Qu.: 0.02024
## Max. :174.93 Max. : 6.887 Max. :22.196 Max. : 0.58538
##
## -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#spatial description of data
plot(bmdat.h)
## $data.vect
## class : SpatVector
## geometry : points
## dimensions : 458, 2 (geometries, attributes)
## extent : -95.93779, -67.98735, 27.01781, 47.81878 (xmin, xmax, ymin, ymax)
## coord. ref. :
## names : resp dataset
## type : <num> <chr>
## values : 10 Initial dataset
## 10 Initial dataset
## 10 Initial dataset
##
## $data.label
## 9 10
## "**Presences**" "Presences (calibration)"
## 11 12
## "Presences (validation)" "Presences (evaluation)"
## 19 20
## "**True Absences**" "True Absences (calibration)"
## 21 22
## "True Absences (validation)" "True Absences (evaluation)"
## 29 30
## "**Pseudo-Absences**" "Pseudo-Absences (calibration)"
## 31 1
## "Pseudo-Absences (validation)" "Background"
##
## $data.plot
#Historic
# k-fold selection
cv.k.h <- bm_CrossValidation(bm.format = bmdat.h, #Formatted biomod data
strategy = "kfold", #validation strategy - various
nb.rep = 2, #number of repitions
k = 3) # number of split datasets of equivalent sizes
##
##
## Checking Cross-Validation arguments...
##
## > k-fold cross-validation selection
## !!! Some calibration dataset do not have both presences and absences: _allData_RUN1, _allData_RUN2, _allData_RUN3, _allData_RUN4, _allData_RUN5, _allData_RUN6
## Warning in bm_CrossValidation(bm.format = bmdat.h, strategy = "kfold", nb.rep =
## 2, : Some calibration repetion do not have both presences and absences
##
## !!! Some validation dataset do not have both presences and absences: _allData_RUN1, _allData_RUN2, _allData_RUN3, _allData_RUN4, _allData_RUN5, _allData_RUN6
## Warning in bm_CrossValidation(bm.format = bmdat.h, strategy = "kfold", nb.rep =
## 2, : Some validation repetion do not have both presences and absences
head(cv.k.h)
## _allData_RUN1 _allData_RUN2 _allData_RUN3 _allData_RUN4 _allData_RUN5
## [1,] TRUE FALSE TRUE FALSE TRUE
## [2,] TRUE FALSE TRUE FALSE TRUE
## [3,] FALSE TRUE TRUE FALSE TRUE
## [4,] TRUE TRUE FALSE TRUE FALSE
## [5,] TRUE TRUE FALSE TRUE FALSE
## [6,] TRUE FALSE TRUE FALSE TRUE
## _allData_RUN6
## [1,] TRUE
## [2,] TRUE
## [3,] TRUE
## [4,] TRUE
## [5,] TRUE
## [6,] TRUE
summary(bmdat.h, calib.lines = cv.k.h)
## dataset run PA Presences True_Absences Pseudo_Absences Undefined
## 1 initial <NA> <NA> 76 382 0 322
## 2 calibration RUN1 allData 51 249 220 NA
## 3 validation RUN1 allData 25 133 102 NA
## 4 calibration RUN2 allData 50 253 217 NA
## 5 validation RUN2 allData 26 129 105 NA
## 6 calibration RUN3 allData 51 262 207 NA
## 7 validation RUN3 allData 25 120 115 NA
## 8 calibration RUN4 allData 51 244 225 NA
## 9 validation RUN4 allData 25 138 97 NA
## 10 calibration RUN5 allData 50 266 204 NA
## 11 validation RUN5 allData 26 116 118 NA
## 12 calibration RUN6 allData 51 254 215 NA
## 13 validation RUN6 allData 25 128 107 NA
plot(bmdat.h, calib.lines = cv.k.h)
## $data.vect
## class : SpatVector
## geometry : points
## dimensions : 5138, 2 (geometries, attributes)
## extent : -95.93779, -67.85529, 27.01781, 47.81878 (xmin, xmax, ymin, ymax)
## coord. ref. :
## names : resp dataset
## type : <num> <chr>
## values : 10 Initial dataset
## 10 Initial dataset
## 10 Initial dataset
##
## $data.label
## 9 10
## "**Presences**" "Presences (calibration)"
## 11 12
## "Presences (validation)" "Presences (evaluation)"
## 19 20
## "**True Absences**" "True Absences (calibration)"
## 21 22
## "True Absences (validation)" "True Absences (evaluation)"
## 29 30
## "**Pseudo-Absences**" "Pseudo-Absences (calibration)"
## 31 1
## "Pseudo-Absences (validation)" "Background"
##
## $data.plot
#get today's date
# today = format(Sys.Date(), format = "%Y%m%d")
#
# #save as pdf
# pdf(file.path(L2, #object saving file directory
# paste0("bmdat_cv_k_h_", today, ".pdf")))
#
# try(plot(bmdat.h, calib.lines = cv.k.h))
#
# dev.off()
Some other ways below
# # stratified selection (geographic)
# cv.s <- bm_CrossValidation(bm.format = bmdat.h,
# strategy = "strat",
# k = 2,
# balance = "presences",
# strat = "x")
# head(cv.s)
see site
Model options:
| model | type | model descriptions | package | |
|---|---|---|---|---|
| 1 | ANN | binary | Fit Neural Networks | nnet |
| 2 | CTA | binary | Recursive Partitioning and Regression Trees | rpart |
| 3 | FDA | binary | Flexible Discriminant Analysis | mda |
| 4 | GAM | binary | General Additive Model | gam |
| 5 | GAM | binary | General Additive Model | mgcv |
| 6 | GAM | binary | General Additive Model | mgcv |
| 7 | GBM | binary | Generalized Boosted Regression Modeling (GBM) | gbm |
| 8 | GLM | binary | General Linear Model | stats |
| 9 | MARS | binary | Multivariate Adaptive Regression Splines | earth |
| 10 | MAXENT | binary | Maximum Entropy | MAXENT |
| 11 | MAXNET | binary | Maximum Entropy | manet |
| 12 | RF | binary | Random Forest | randomForest |
| 13 | SRE | binary | Surface Range Envelope | biomod2 |
| 14 | XGBOOST | binary | eXtreme Gradient Boosting Training | xboost |
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
#Need to dowload the packages for the desired models before running models - see table above
mods = c("CTA", "GBM", "GLM", "MARS", "RF", "SRE")
# Model single models
myBiomodModelOut.h <- BIOMOD_Modeling(bm.format = bmdat.h, #formatted biomod data
models = mods, #for simulation set
CV.strategy = 'kfold', #cross-validation strategy
CV.nb.rep = 2, #cross-val repititions
CV.k = 3, #cross-val partitions
OPT.strategy = 'bigboss', #model param selection
var.import = 3, #number of permutations to est var importance
seed.val = 1, #to keep same results when rerunning
# nb.cpu = 8), #computing resources
metric.eval = c('TSS','ROC')) #evaluation metrics
## Warning in bm_CrossValidation(bm.format = bm.format, strategy = CV.strategy, :
## Some calibration repetion do not have both presences and absences
## Warning in bm_CrossValidation(bm.format = bm.format, strategy = CV.strategy, :
## Some validation repetion do not have both presences and absences
## Warning: executing %dopar% sequentially: no parallel backend registered
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#view output
myBiomodModelOut.h
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# Get evaluation scores & variables importance - uncomment to see
# get_evaluations(myBiomodModelOut.h)
# get_variables_importance(myBiomodModelOut.h)
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# Represent evaluation scores & variables importance
bm_PlotEvalMean(bm.out = myBiomodModelOut.h)
bm_PlotEvalBoxplot(bm.out = myBiomodModelOut.h, group.by = c('algo', 'algo'))
bm_PlotEvalBoxplot(bm.out = myBiomodModelOut.h, group.by = c('algo', 'run'))
bm_PlotVarImpBoxplot(bm.out = myBiomodModelOut.h, group.by = c('expl.var', 'algo', 'algo'))
bm_PlotVarImpBoxplot(bm.out = myBiomodModelOut.h, group.by = c('expl.var', 'algo', 'run'))
bm_PlotVarImpBoxplot(bm.out = myBiomodModelOut.h, group.by = c('algo', 'expl.var', 'run'))
Seems like Random Forest (RF) was the best, followed by Generalised Boosted Models (GBM), and then CTA.
Overall, ppt.avg was the most important variable across all models
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# Represent response curves
bm_PlotResponseCurves(bm.out = myBiomodModelOut.h,
models.chosen = get_built_models(myBiomodModelOut.h)[c(1:3, 12:14)], fixed.var = 'median')
bm_PlotResponseCurves(bm.out = myBiomodModelOut.h,
models.chosen = get_built_models(myBiomodModelOut.h)[c(1:3, 12:14)], fixed.var = 'min')
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
#Response Curves heat map
bm_PlotResponseCurves(bm.out = myBiomodModelOut.h,
models.chosen = get_built_models(myBiomodModelOut.h)[3],
fixed.var = 'median',
do.bivariate = TRUE)
Make a vector of the best scoring models
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
## choose high scoring models (calibration)
modeleval<-myBiomodModelOut.h@models.evaluation
#Create a new data frame with stuff
modelevaldataset<- data.frame(modeleval@val[["full.name"]], modeleval@val[["algo"]], modeleval@val[["metric.eval"]], modeleval@val[["validation"]], modeleval@val[["calibration"]])
#Filter based on a TSS threshold
bestmodelscal <- modelevaldataset %>%
filter(modeleval.val...metric.eval...== "TSS") %>%
filter(modeleval.val...calibration... >= 0.7) # select models that had TSS over 0.6, done by Carroll et al.
# upped to 0.7 b/c why not? more robust?
#Peep into the remaining models
unique(bestmodelscal$modeleval.val...algo...)
## [1] "GBM" "RF" "CTA"
#Plot
ggplot(bestmodelscal)+
geom_col(mapping = aes(x = modeleval.val...algo..., y = modeleval.val...calibration...))
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
##choose high scoring models (validation)
#Histrogram
hist(modelevaldataset$modeleval.val...validation...) # some do hit over 0.7. Oh, but this includes everything..
#Pull only the TSS metric
TSS_val <- modelevaldataset %>%
filter(modeleval.val...metric.eval...== "TSS")
#Histrogram
hist(TSS_val$modeleval.val...validation...) # max is just above 0.6
bestmodelsval <- modelevaldataset %>%
filter(modeleval.val...metric.eval...== "TSS") %>%
filter(modeleval.val...validation... > 0.5) # select models that had TSS over 0.5, lower than Carroll but i think she used calibration data
# upped to 0.7 b/c why not? more robust?
#Peep into the remaining models
unique(bestmodelsval$modeleval.val...algo...)
## character(0)
#Plot
ggplot(bestmodelsval)+
geom_col(mapping = aes(x = modeleval.val...algo..., y = modeleval.val...calibration...))
Aw dang, well we are intreasted in predicting what has already happened and not in new places – using to predict
Let’s save the best models from the calibration!
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
## best models
bestmods <- c("CTA", "GBM", "RF")
#calibration
filt.cal<- bestmodelscal %>%
filter(modeleval.val...algo...%in% bestmods)
#validation
filt.val <- bestmodelsval %>%
filter(modeleval.val...algo...%in% bestmods)
#full names of best models
bestmodsfullnames.h = c(filt.cal$modeleval.val...full.name..., filt.val$modeleval.val...full.name...)
bestmodsfullnames.h
## [1] "Acer.rubrum_allData_RUN1_GBM" "Acer.rubrum_allData_RUN1_RF"
## [3] "Acer.rubrum_allData_RUN2_RF" "Acer.rubrum_allData_RUN3_CTA"
## [5] "Acer.rubrum_allData_RUN3_RF" "Acer.rubrum_allData_RUN4_RF"
## [7] "Acer.rubrum_allData_RUN5_GBM" "Acer.rubrum_allData_RUN5_RF"
## [9] "Acer.rubrum_allData_RUN6_CTA" "Acer.rubrum_allData_RUN6_RF"
## [11] "Acer.rubrum_allData_allRun_RF"
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# Model ensemble models
myBiomodEM.h <- BIOMOD_EnsembleModeling(bm.mod = myBiomodModelOut.h, #singles model output
models.chosen = bestmodsfullnames.h, #vector of best models
em.by = 'all', #what models will be combined to ensemble
em.algo = c('EMmean', 'EMcv', 'EMci', 'EMmedian', 'EMca', 'EMwmean'), #types of ensembles models to be computed
metric.select = c('TSS'),
# metric.select.thresh = c(0.7), #don't need this bc double filtering
metric.eval = c('TSS', 'ROC'), #evaluation metrics to filter models
var.import = 3, #num permutationsto est var importance
EMci.alpha = 0.05, #significance level
EMwmean.decay = 'proportional') #relative importance of weights
#Print results
myBiomodEM.h
#get today's date - uncomment to save
# today = format(Sys.Date(), format = "%Y%m%d")
#
# ##save biomodout file
# saveRDS(myBiomodEM.h, file = file.path(L2, myBiomodEM.h@sp.name,
# paste0("biomodensemble_Acer_rubrum_hist_out_", today, ".rds")))
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
#Get evaluation scores & variables importance - can save these as csv
evalensemble <- get_evaluations(myBiomodEM.h)
varimpensemble <- get_variables_importance(myBiomodEM.h)
#Extract kept models in the ensemble
kept_models <- get_kept_models(myBiomodEM.h)
print(kept_models)
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# Represent evaluation scores & variables importance
bm_PlotEvalMean(bm.out = myBiomodEM.h, group.by = 'full.name')
## Warning in .bm_PlotEvalMean.check.args(bm.out, metric.eval, dataset, group.by,
## : ROC, TSS evaluation metric.eval automatically selected
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_errorbarh()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_errorbarh()`).
bm_PlotEvalBoxplot(bm.out = myBiomodEM.h, group.by = c('full.name', 'full.name'))
bm_PlotVarImpBoxplot(bm.out = myBiomodEM.h, group.by = c('expl.var', 'full.name', 'full.name'))
bm_PlotVarImpBoxplot(bm.out = myBiomodEM.h, group.by = c('expl.var', 'algo', 'merged.by.run'))
bm_PlotVarImpBoxplot(bm.out = myBiomodEM.h, group.by = c('algo', 'expl.var', 'merged.by.run'))
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
#Response cirves heat map
bm_PlotResponseCurves(bm.out = myBiomodEM.h,
models.chosen = get_built_models(myBiomodEM.h)[7],
fixed.var = 'median',
do.bivariate = TRUE)
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# Represent response curves
bm_PlotResponseCurves(bm.out = myBiomodEM.h,
models.chosen = get_built_models(myBiomodEM.h)[c(1, 6, 7)],
fixed.var = 'median')
bm_PlotResponseCurves(bm.out = myBiomodEM.h,
models.chosen = get_built_models(myBiomodEM.h)[c(1, 5, 6)],
fixed.var = 'median')
bm_PlotResponseCurves(bm.out = myBiomodEM.h,
models.chosen = get_built_models(myBiomodEM.h)[c(1, 6, 7)],
fixed.var = 'min')
Projecting across space using initial ensemble outputs and pre-moeling data.
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# project species - will pick best one
sp_projection.h <- BIOMOD_EnsembleForecasting(myBiomodEM.h, #output from ensemble
proj.name = myRespName.h, #species name
new.env = clim.h, #enviro matrix
new.env.xy = myRespXY.h,
models.chosen = "all")
plot(sp_projection.h)
#spits lots of rasters
#Saving plots - uncomment to save
# png(file = file.path(L2, paste0("EMFProj_all_Historical_", today, ".png")), width=8, height=7, units="in", res=600)
# try(plot(sp_projection.h))
# dev.off()
Choose the ensemble model (one) that looks the ‘best’ to you and re-run on its own - Here we chose the EMca by TSS
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
EMcaTSS.h <- BIOMOD_EnsembleForecasting(myBiomodEM.h,
proj.name = myRespName.h,
new.env = clim.h,
new.env.xy = myRespXY.h,
models.chosen = "Acer.rubrum_EMcaByTSS_mergedData_mergedRun_mergedAlgo") #EMca by TSS run - to choose how to ensemble them #spits one raster
plot(EMcaTSS.h)
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
#read raster output
r = rast(EMcaTSS.h@proj.out@link)
#write ouput with new name -- will be overwritten by the current timeperiod model since the species share the same name
writeRaster(r,
file = file.path(L2, paste0(
gsub("\\.", "_", EMcaTSS.h@sp.name),
"_FinalEnsemble_H_Phenology.tif")), overwrite = TRUE)
biomod2 formatting#Current
#Select species name
myRespName.c = "Acer rubrum"
# Get corresponding P/A data
myResp.c = phe.cw[, myRespName.c]
#Get corresponding coordinates
myRespXY.c = as.data.frame(phe.cw[, c('longitude', 'latitude')]) #Make sure long goes first!
Format data with true absences in biomod2
#Historical
bmdat.c = BIOMOD_FormatingData(
resp.name = myRespName.c, #species name
resp.var = myResp.c, #presences-absences
resp.xy = myRespXY.c, #lat/lon
expl.var = clim.c #raster stack
)
##
## -=-=-=-=-=-=-=-=-=-=-=-=-= Acer rubrum Data Formating -=-=-=-=-=-=-=-=-=-=-=-=-=
##
## ! Response variable name was converted into Acer.rubrum
## !!! Some data are located in the same raster cell.
## Please set `filter.raster = TRUE` if you want an automatic filtering.
## ! No data has been set aside for modeling evaluation
## -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= Done -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#save object
# save(bmdat.c, file = file.path(L2, "bmdat_c.RData"))
#biomod data summary
print(bmdat.c)
##
## -=-=-=-=-=-=-=-=-=-=-=-=-=-= BIOMOD.formated.data -=-=-=-=-=-=-=-=-=-=-=-=-=-=
##
## dir.name = .
##
## sp.name = Acer.rubrum
##
## 487 presences, 1059 true absences and 851 undefined points in dataset
##
##
## 4 explanatory variables
##
## ppt.avg ppt.anom temp.avg temp.anom
## Min. : 51.25 Min. :-7.370 Min. : 2.532 Min. :-0.46648
## 1st Qu.: 90.63 1st Qu.: 1.554 1st Qu.: 8.706 1st Qu.:-0.03283
## Median : 98.03 Median : 3.521 Median :11.379 Median : 0.09753
## Mean :101.28 Mean : 3.167 Mean :12.114 Mean : 0.11260
## 3rd Qu.:108.64 3rd Qu.: 5.097 3rd Qu.:15.488 3rd Qu.: 0.25386
## Max. :173.19 Max. :13.467 Max. :22.835 Max. : 0.66756
##
## -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#spatial description of data
plot(bmdat.c)
## $data.vect
## class : SpatVector
## geometry : points
## dimensions : 1546, 2 (geometries, attributes)
## extent : -95.29245, -67.1107, 26.9611, 47.98861 (xmin, xmax, ymin, ymax)
## coord. ref. :
## names : resp dataset
## type : <num> <chr>
## values : 10 Initial dataset
## 10 Initial dataset
## 10 Initial dataset
##
## $data.label
## 9 10
## "**Presences**" "Presences (calibration)"
## 11 12
## "Presences (validation)" "Presences (evaluation)"
## 19 20
## "**True Absences**" "True Absences (calibration)"
## 21 22
## "True Absences (validation)" "True Absences (evaluation)"
## 29 30
## "**Pseudo-Absences**" "Pseudo-Absences (calibration)"
## 31 1
## "Pseudo-Absences (validation)" "Background"
##
## $data.plot
#Current
# k-fold selection
cv.k.c <- bm_CrossValidation(bm.format = bmdat.c, #Formatted biomod data
strategy = "kfold", #validation strategy - various
nb.rep = 2, #number of repitions
k = 3) # number of split datasets of equivalent sizes
##
##
## Checking Cross-Validation arguments...
##
## > k-fold cross-validation selection
## !!! Some calibration dataset do not have both presences and absences: _allData_RUN1, _allData_RUN2, _allData_RUN3, _allData_RUN4, _allData_RUN5, _allData_RUN6
## Warning in bm_CrossValidation(bm.format = bmdat.c, strategy = "kfold", nb.rep =
## 2, : Some calibration repetion do not have both presences and absences
##
## !!! Some validation dataset do not have both presences and absences: _allData_RUN1, _allData_RUN2, _allData_RUN3, _allData_RUN4, _allData_RUN5, _allData_RUN6
## Warning in bm_CrossValidation(bm.format = bmdat.c, strategy = "kfold", nb.rep =
## 2, : Some validation repetion do not have both presences and absences
head(cv.k.c)
## _allData_RUN1 _allData_RUN2 _allData_RUN3 _allData_RUN4 _allData_RUN5
## [1,] FALSE TRUE TRUE TRUE FALSE
## [2,] FALSE TRUE TRUE FALSE TRUE
## [3,] FALSE TRUE TRUE TRUE TRUE
## [4,] FALSE TRUE TRUE TRUE TRUE
## [5,] FALSE TRUE TRUE TRUE FALSE
## [6,] TRUE FALSE TRUE FALSE TRUE
## _allData_RUN6
## [1,] TRUE
## [2,] TRUE
## [3,] FALSE
## [4,] FALSE
## [5,] TRUE
## [6,] TRUE
summary(bmdat.c, calib.lines = cv.k.c)
## dataset run PA Presences True_Absences Pseudo_Absences Undefined
## 1 initial <NA> <NA> 487 1059 0 851
## 2 calibration RUN1 allData 325 718 555 NA
## 3 validation RUN1 allData 162 341 296 NA
## 4 calibration RUN2 allData 324 705 569 NA
## 5 validation RUN2 allData 163 354 282 NA
## 6 calibration RUN3 allData 325 695 578 NA
## 7 validation RUN3 allData 162 364 273 NA
## 8 calibration RUN4 allData 325 711 562 NA
## 9 validation RUN4 allData 162 348 289 NA
## 10 calibration RUN5 allData 324 713 561 NA
## 11 validation RUN5 allData 163 346 290 NA
## 12 calibration RUN6 allData 325 694 579 NA
## 13 validation RUN6 allData 162 365 272 NA
plot(bmdat.c, calib.lines = cv.k.c)
## $data.vect
## class : SpatVector
## geometry : points
## dimensions : 15928, 2 (geometries, attributes)
## extent : -95.50437, -67.1107, 26.9611, 47.98861 (xmin, xmax, ymin, ymax)
## coord. ref. :
## names : resp dataset
## type : <num> <chr>
## values : 10 Initial dataset
## 10 Initial dataset
## 10 Initial dataset
##
## $data.label
## 9 10
## "**Presences**" "Presences (calibration)"
## 11 12
## "Presences (validation)" "Presences (evaluation)"
## 19 20
## "**True Absences**" "True Absences (calibration)"
## 21 22
## "True Absences (validation)" "True Absences (evaluation)"
## 29 30
## "**Pseudo-Absences**" "Pseudo-Absences (calibration)"
## 31 1
## "Pseudo-Absences (validation)" "Background"
##
## $data.plot
#get today's date - uncomment if you want to save
# today = format(Sys.Date(), format = "%Y%m%d")
#
# #save as pdf
# pdf(file.path(L2, #object with file directory
# paste0("bmdat_cv_k_c_", today, ".pdf")))
#
# try(plot(bmdat.c, calib.lines = cv.k.c))
#
# dev.off()
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
#Models of interest
mods = c("CTA", "GBM", "GLM", "MARS", "RF", "SRE")
# Model single models
myBiomodModelOut.c <- BIOMOD_Modeling(bm.format = bmdat.c, #formatted biomod data
models = mods, #for simulation set
CV.strategy = 'kfold', #cross-validation strategy
CV.nb.rep = 2, #cross-val repititions
CV.k = 3, #cross-val partitions
OPT.strategy = 'bigboss', #model param selection
var.import = 3, #number of permutations to est var importance
seed.val = 2, #to keep same results when rerunning
# nb.cpu = 8) #computing resources
metric.eval = c('TSS','ROC')) #evaluation metrics
## Warning in bm_CrossValidation(bm.format = bm.format, strategy = CV.strategy, :
## Some calibration repetion do not have both presences and absences
## Warning in bm_CrossValidation(bm.format = bm.format, strategy = CV.strategy, :
## Some validation repetion do not have both presences and absences
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#view output
myBiomodModelOut.c
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# Get evaluation scores & variables importance -- uncomment to see
# get_evaluations(myBiomodModelOut.c)
# get_variables_importance(myBiomodModelOut.c)
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# Represent evaluation scores & variables importance
bm_PlotEvalMean(bm.out = myBiomodModelOut.c)
## Warning in .bm_PlotEvalMean.check.args(bm.out, metric.eval, dataset, group.by,
## : ROC, TSS evaluation metric.eval automatically selected
bm_PlotEvalBoxplot(bm.out = myBiomodModelOut.c, group.by = c('algo', 'algo'))
bm_PlotEvalBoxplot(bm.out = myBiomodModelOut.c, group.by = c('algo', 'run'))
bm_PlotVarImpBoxplot(bm.out = myBiomodModelOut.c, group.by = c('expl.var', 'algo', 'algo'))
bm_PlotVarImpBoxplot(bm.out = myBiomodModelOut.c, group.by = c('expl.var', 'algo', 'run'))
bm_PlotVarImpBoxplot(bm.out = myBiomodModelOut.c, group.by = c('algo', 'expl.var', 'run'))
Seems like Random Forest (RF) was the best, followed by Generalised Boosted Models (GBM), and then CTA.
Overall, ppt.avg was the most important variable across all models, followed by temp.avg
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# Represent response curves
bm_PlotResponseCurves(bm.out = myBiomodModelOut.c,
models.chosen = get_built_models(myBiomodModelOut.c)[c(1:3, 12:14)], fixed.var = 'median')
bm_PlotResponseCurves(bm.out = myBiomodModelOut.c,
models.chosen = get_built_models(myBiomodModelOut.c)[c(1:3, 12:14)], fixed.var = 'min')
bm_PlotResponseCurves(bm.out = myBiomodModelOut.c,
models.chosen = get_built_models(myBiomodModelOut.c)[3],
fixed.var = 'median',
do.bivariate = TRUE)
Make a vector of the best models
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
## choose high scoring models (calibration)
modeleval<-myBiomodModelOut.c@models.evaluation
#Create a new data frame with stuff
modelevaldataset<- data.frame(modeleval@val[["full.name"]], modeleval@val[["algo"]], modeleval@val[["metric.eval"]], modeleval@val[["validation"]], modeleval@val[["calibration"]])
#Filter based on a TSS threshold
bestmodelscal <- modelevaldataset %>%
filter(modeleval.val...metric.eval...== "TSS") %>%
filter(modeleval.val...calibration... >= 0.7) # select models that had TSS over 0.6, done by Carroll et al. (ASK JILL FOR CITATION)
# upped to 0.7 b/c why not? more robust?
#Peep into the remaining models
unique(bestmodelscal$modeleval.val...algo...)
## [1] "RF"
#Plot
ggplot(bestmodelscal)+
geom_col(mapping = aes(x = modeleval.val...algo..., y = modeleval.val...calibration...))
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
##choose high scoring models (validation)
#Histrogram
hist(modelevaldataset$modeleval.val...validation...) # some do hit over 0.7. Oh, but this includes everything..
#Pull only the TSS metric
TSS_val <- modelevaldataset %>%
filter(modeleval.val...metric.eval...== "TSS")
#Histrogram
hist(TSS_val$modeleval.val...validation...) # max is just above 0.6
bestmodelsval <- modelevaldataset %>%
filter(modeleval.val...metric.eval...== "TSS") %>%
filter(modeleval.val...validation... > 0.5) # select models that had TSS over 0.5, lower than Carroll but i think she used calibration data
# upped to 0.7 b/c why not? more robust?
#Peep into the remaining models
unique(bestmodelsval$modeleval.val...algo...)
## character(0)
#Plot
ggplot(bestmodelsval)+
geom_col(mapping = aes(x = modeleval.val...algo..., y = modeleval.val...calibration...))
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
## best models
bestmods <- c("RF")
#calibration
filt.cal<- bestmodelscal %>%
filter(modeleval.val...algo...%in% bestmods)
#validation
filt.val <- bestmodelsval %>%
filter(modeleval.val...algo...%in% bestmods)
#full names of best models
bestmodsfullnames.c = c(filt.cal$modeleval.val...full.name..., filt.val$modeleval.val...full.name...)
bestmodsfullnames.c
## [1] "Acer.rubrum_allData_RUN1_RF" "Acer.rubrum_allData_RUN2_RF"
## [3] "Acer.rubrum_allData_RUN3_RF" "Acer.rubrum_allData_RUN4_RF"
## [5] "Acer.rubrum_allData_RUN5_RF" "Acer.rubrum_allData_RUN6_RF"
## [7] "Acer.rubrum_allData_allRun_RF"
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# Model ensemble models
myBiomodEM.c <- BIOMOD_EnsembleModeling(bm.mod = myBiomodModelOut.c, #singles model output
models.chosen = bestmodsfullnames.c, #vector of best models
em.by = 'all', #what models will be combined to ensemble
em.algo = c('EMmean', 'EMcv', 'EMci', 'EMmedian', 'EMca', 'EMwmean'), #types of ensembles models to be computed
metric.select = c('TSS'),
# metric.select.thresh = c(0.7), #don't need this bc double filtering
metric.eval = c('TSS', 'ROC'), #evaluation metrics to filter models
var.import = 3, #num permutationsto est var importance
EMci.alpha = 0.05, #significance level
EMwmean.decay = 'proportional') #relative importance of weights
#Print results
myBiomodEM.c
#get today's date - uncomment to save
# today = format(Sys.Date(), format = "%Y%m%d")
#
# ##save biomodout file
# saveRDS(myBiomodEM.c, file = file.path(L2, myBiomodEM.c@sp.name,
# paste0("biomodensemble_Acer_rubrum_cur_out_", today, ".rds")))
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
#Get evaluation scores & variables importance - can save these as csv
evalensemble <- get_evaluations(myBiomodEM.c)
varimpensemble <- get_variables_importance(myBiomodEM.c)
#Extract kept models in the ensemble
kept_models <- get_kept_models(myBiomodEM.c)
print(kept_models)
## [1] "Acer.rubrum_allData_RUN1_RF" "Acer.rubrum_allData_RUN2_RF"
## [3] "Acer.rubrum_allData_RUN3_RF" "Acer.rubrum_allData_RUN4_RF"
## [5] "Acer.rubrum_allData_RUN5_RF" "Acer.rubrum_allData_RUN6_RF"
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# Represent evaluation scores & variables importance
bm_PlotEvalMean(bm.out = myBiomodEM.c, group.by = 'full.name')
## Warning in .bm_PlotEvalMean.check.args(bm.out, metric.eval, dataset, group.by,
## : ROC, TSS evaluation metric.eval automatically selected
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_errorbarh()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_errorbarh()`).
bm_PlotEvalBoxplot(bm.out = myBiomodEM.c, group.by = c('full.name', 'full.name'))
bm_PlotVarImpBoxplot(bm.out = myBiomodEM.c, group.by = c('expl.var', 'full.name', 'full.name'))
bm_PlotVarImpBoxplot(bm.out = myBiomodEM.c, group.by = c('expl.var', 'algo', 'merged.by.run'))
bm_PlotVarImpBoxplot(bm.out = myBiomodEM.c, group.by = c('algo', 'expl.var', 'merged.by.run'))
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
#Response curve
bm_PlotResponseCurves(bm.out = myBiomodEM.c,
models.chosen = get_built_models(myBiomodEM.c)[7],
fixed.var = 'median',
do.bivariate = TRUE)
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# Represent response curves
# Represent response curves
bm_PlotResponseCurves(bm.out = myBiomodEM.c,
models.chosen = get_built_models(myBiomodEM.c)[c(1, 6, 7)],
fixed.var = 'median')
bm_PlotResponseCurves(bm.out = myBiomodEM.c,
models.chosen = get_built_models(myBiomodEM.c)[c(1, 5, 6)],
fixed.var = 'median')
bm_PlotResponseCurves(bm.out = myBiomodEM.c,
models.chosen = get_built_models(myBiomodEM.c)[c(1, 6, 7)],
fixed.var = 'min')
Projecting across space using initial ensemble outputs and pre-moeling data.
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
# project species - will pick best one
sp_projection.c <- BIOMOD_EnsembleForecasting(myBiomodEM.c, #output from ensemble
proj.name = myRespName.c,#species name
new.env = clim.c, #enviro matrix
new.env.xy = myRespXY.c,
models.chosen = "all")
plot(sp_projection.c)
#spits lots of rasters
#Saving plots - uncomment to save
# png(file = file.path(L2, paste0("EMFProj_all_Current_", today, ".png")), width = 8, height = 7, units = "in", res=600)
# try(plot(sp_projection.c))
# dev.off()
Choose the ensemble model (one) that looks the ‘best’ to you and re-run on its own - Here we chose the EMca by TSS
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
EMcaTSS.c <- BIOMOD_EnsembleForecasting(myBiomodEM.c,
proj.name = myRespName.c,
new.env = clim.c,
new.env.xy = myRespXY.c,
# binary.meth = "TSS", #how values of output being saved
models.chosen = "Acer.rubrum_EMcaByTSS_mergedData_mergedRun_mergedAlgo", #weighted mean TSS run - to choose how to ensemble them #spits one raster
)
plot(EMcaTSS.c)
[https://biomodhub.github.io/biomod2/articles/vignette_presentation.html]
#set working directory - biomod2 automatically creates a folder
setwd(L2.em)
#read raster output
r = rast(EMcaTSS.c@proj.out@link)
#write ouput with new name -- will be overwritten by the current timeperiod model since the species share the same name
writeRaster(r,
file = file.path(L2, paste0(
gsub("\\.", "_", EMcaTSS.c@sp.name),
"_FinalEnsemble_C_Phenology.tif")), overwrite = TRUE)